In [1]:
# load the iris dataset as an example
from sklearn.datasets import load_iris
iris = load_iris()
In [2]:
# store the feature matrix (X) and response vector (y)
X = iris.data
y = iris.target
"Features" are also known as predictors, inputs or attributes. The "reponse" is also known as the target, label or output.
In [3]:
# check the shapes of X and y
print(X.shape)
print(y.shape)
"Observations" are also known as samples, instances, or records.
In [5]:
# examine the first 5 rows of the feature matrix
import pandas as pd
pd.DataFrame(X, columns=iris.feature_names).head()
Out[5]:
In [6]:
y[:5]
Out[6]:
In [8]:
pd.Series(y).value_counts()
Out[8]:
In order to build a model, the features must be numeric, and every observation must have the same features in the same order.
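As a quick sanity check, here is a minimal sketch (reusing the X and iris objects loaded above) that confirms the iris features are numeric and that every observation carries the same four features in the same order:
# minimal sanity check, reusing the X and iris objects loaded above
print(X.dtype)              # float64: all features are numeric
print(X.shape)              # (150, 4): every observation has the same 4 features
print(iris.feature_names)   # the fixed feature order shared by all rows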
In [10]:
# import the class
from sklearn.neighbors import KNeighborsClassifier
# instantiate the model (with the default parameters)
knn = KNeighborsClassifier()
# fit the model with data (occurs in-place)
knn.fit(X, y)
Out[10]:
In order to make a prediction, the new observation must have the same features as the training observations, both in number and meaning.
In [12]:
# predict the response for a new observation
knn.predict([[3, 5, 4, 2]])
Out[12]:
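The prediction comes back as an encoded class label (an integer index into iris.target_names). A minimal sketch of decoding it into a species name, assuming the knn model fitted above:
# decode the predicted class index into a species name (sketch, using the fitted knn above)
pred = knn.predict([[3, 5, 4, 2]])
print(iris.target_names[pred])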
In [13]:
# example text for model training (SMS messages)
simple_train = ['call you tonight', 'Call me a cab', 'please call me... PLEASE!']
In [14]:
# import and instantiate CountVectorizer (with the default parameters)
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
vect.fit(simple_train)
Out[14]:
In [15]:
vect.get_feature_names()
Out[15]:
In [16]:
# transform training data into a 'document-term matrix'
simple_train_dtm = vect.transform(simple_train)
simple_train_dtm
Out[16]:
In [17]:
# convert sparse matrix to a dense matrix
simple_train_dtm.toarray()
Out[17]:
In [18]:
# examine the vocabulary and document-term matrix together
pd.DataFrame(simple_train_dtm.toarray(), columns=vect.get_feature_names())
Out[18]:
In [19]:
# examine the sparse matrix contents
print(simple_train_dtm)
As most documents will typically use a very small subset of the words used in the corpus, the resulting matrix will have many feature values that are zeros (typically more than 99% of them).
For instance, a collection of 10,000 short text documents (such as emails) will use a vocabulary on the order of 100,000 unique words in total, while each individual document will use only 100 to 1,000 unique words.
To store such a matrix in memory and to speed up operations on it, implementations typically use a sparse representation such as those available in the scipy.sparse package.
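A short sketch of measuring that sparsity for the small document-term matrix built above (assuming simple_train_dtm from the transform step):
# fraction of zero entries in the sparse document-term matrix (sketch)
n_total = simple_train_dtm.shape[0] * simple_train_dtm.shape[1]
sparsity = 1 - simple_train_dtm.nnz / n_total
print('{:.2%} of the entries are zeros'.format(sparsity))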
In [20]:
# example text for model testing
simple_test = ["please don't call me"]
In order to make a prediction, the new observation must have the same features as the training observations, both in number and meaning.
In [21]:
# transform testing data into a document-term matrix (using existing vocabulary)
simple_test_dtm = vect.transform(simple_test)
simple_test_dtm.toarray()
Out[21]:
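Because the fitted vocabulary is reused, the test document-term matrix has exactly the same columns, in the same order, as the training one; test words that were never seen during fitting are simply dropped. A short check under that assumption:
# the test matrix reuses the fitted training vocabulary, so the column counts match (sketch)
print(simple_train_dtm.shape[1], simple_test_dtm.shape[1])   # same number of features
print(vect.vocabulary_)   # only terms learned from simple_train; unseen test words are ignored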
In [31]:
# read file into pandas using a relative path
# alternative: read file into pandas from a URL
# url = 'https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv'
path = 'sms.tsv'
sms = pd.read_table(path, header=None, names=['label', 'message'])
In [32]:
# examine the shape
sms.shape
Out[32]:
In [33]:
# examine the first 10 rows
sms.head(10)
Out[33]:
In [34]:
# examine the class distribution
sms.label.value_counts()
Out[34]:
In [36]:
# examine the class distribution as percentages
sms.label.value_counts() * 100 / sms.shape[0]
Out[36]:
In [38]:
# convert label to a numerical variable
sms['label_num'] = sms.label.map({
'ham' : 0,
'spam': 1
})
In [39]:
# check that the conversion worked
sms.head(10)
Out[39]:
In [40]:
# define X and y from the SMS data for use with CountVectorizer
X = sms.message
y = sms.label_num
print(X.shape)
print(y.shape)
In [42]:
# split X, y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1087)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
In [43]:
# instantiate the vectorizer
vect = CountVectorizer()
# learn training data vocabulary, then use it to create a document-term matrix
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
# examine the document-term matrix
X_train_dtm
Out[43]:
In [44]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm
Out[44]:
In [45]:
# import and instantiate a multinomial naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
In [46]:
# train the model using X_train_dtm (timing it)
%time nb.fit(X_train_dtm, y_train)
Out[46]:
In [47]:
# make class predictions for X_test_dtm
y_pred_class = nb.predict(X_test_dtm)
In [48]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)
Out[48]:
In [49]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)
Out[49]:
In [50]:
# print message text for the false positives (ham incorrectly classified as spam)
X_test[y_test < y_pred_class]
Out[50]:
In [51]:
# print message text for false negatives (spam incorrectly classified as ham)
X_test[y_test > y_pred_class]
Out[51]:
In [52]:
# example false negative
X_test[5037]
Out[52]:
In [53]:
# calculate predicted probabilities for X_test_dtm (poorly calibrated)
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]
y_pred_prob
Out[53]:
In [54]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)
Out[54]:
In [55]:
# import and instantiate a logistic regression model
from sklearn.linear_model import LogisticRegression
lreg = LogisticRegression()
In [57]:
# train the model using X_train_dtm
%time lreg.fit(X_train_dtm, y_train)
Out[57]:
In [59]:
# make class predictions using X_test_dtm
y_pred_class = lreg.predict(X_test_dtm)
In [60]:
# calculate predicted probabilities for X_test_dtm (well calibrated)
y_pred_prob = lreg.predict_proba(X_test_dtm)[:, 1]
y_pred_prob
Out[60]:
In [61]:
# calculate accuracy
metrics.accuracy_score(y_test, y_pred_class)
Out[61]:
In [62]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)
Out[62]:
In [ ]: